Predict house sale prices and practice feature engineering, random forests (RFs), and gradient boosting

# NOTE: the workspace is deliberately not cleared here. `rm(list=ls())` would
# delete every object belonging to whoever sources this script, yet would not
# give a clean session (attached packages, options and par() settings all
# survive it). Start a fresh R session instead.
PROJ_PATH <- '~/Documents/kaggle/house_prices'

# Restore the saved project objects into the current environment. The code
# below uses `train`, `nm`, `num_idx` and `cat_idx` without defining them, so
# they are presumably provided by this file -- TODO confirm against the script
# that writes the .Rdata.
load(file.path(PROJ_PATH, 'data/house_prices.Rdata'))

# Plotting functions
# Histogram with this report's default look.
#
# Args:
#   ...:    forwarded unchanged to hist() (the data, main, xlim, ...).
#   breaks: forwarded to hist() as `breaks`; 30 by default.
#   col:    single colour used for both the bar fill and the bar border.
#   xlab:   forwarded to hist() as `xlab`; NULL by default.
#
# Returns:
#   Whatever hist() returns.
hist2 <- function(..., breaks=30, col='darkgray', xlab=NULL){
  hist(
    ...,
    xlab   = xlab,
    breaks = breaks,
    border = col,
    col    = col
  )
}

# Horizontal bar chart with this report's default look.
#
# Args:
#   ...:  forwarded unchanged to barplot() (the heights, main, xlim, ...).
#   col:  single colour used for both the bar fill and the bar border.
#   xlab: label for the x axis, forwarded to barplot(); NULL by default.
#         Previously this argument was accepted but never passed on, so any
#         label supplied by the caller was silently discarded.
#
# Returns:
#   Whatever barplot() returns.
barplot2 <- function(..., col='darkgray', xlab=NULL){
  barplot(..., col=col, border=col, horiz=TRUE, xlab=xlab)
}

# plot() with this report's default look.
#
# Args:
#   ...: forwarded unchanged to plot() (the data, main, ylim, ...).
#   col: plotting colour; defaults to 'gray30' at alpha 0.2.
#   bty: forwarded to plot() as `bty`; 'n' by default.
#
# Returns:
#   Whatever plot() returns.
plot2 <- function(..., col=adjustcolor('gray30', alpha.f=0.2), bty='n'){
  plot(
    ...,
    bty = bty,
    col = col
  )
}

Data description

# Size of the training set as (rows, columns)
c(nrow(train), ncol(train))
## [1] 1460   81
# Column names of the training set
colnames(train)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"

Histograms of numeric variables

# Uniform 3-line margins, four panels per row
par(mar = rep(3, 4), mfrow = c(1, 4))

# One histogram per column listed in num_idx, titled via the nm lookup
for (ni in num_idx) {
  hist2(train[[ni]], main = nm[ni])
}

# Histogram of the SalePrice column
hist2(train$SalePrice, main = 'SalePrice')

Frequencies of categorical variables

# las=2 draws axis labels perpendicular to their axis; four panels per row
# with a 4-line left margin
par(las = 2, mar = c(3, 4, 3, 3), mfrow = c(1, 4))

# One horizontal bar chart of value counts per column listed in cat_idx
for (ci in cat_idx) {
  barplot2(table(train[[ci]]), main = nm[ci])
}

Proportion of missing values per variable (only variables with at least one missing value)

# Fraction (0-1) of missing values in each column, sorted ascending.
# vapply() guarantees exactly one number per column; sapply() changes its
# return type with its input (e.g. a list for zero columns, which sort()
# cannot handle).
missing_perc <- sort(vapply(train, function(x) mean(is.na(x)), numeric(1)))

par(las=2)
par(mar=c(3,7,2,2))
# Plot only the columns that have at least one missing value; the x axis is
# fixed to the full 0-1 range
barplot2(missing_perc[missing_perc>0],
         cex.names=0.6, cex.axis=0.6, xlim=c(0,1),
         main='Missingness')

Relationship of each variable to sale price

# Uniform 3-line margins, four panels per row
par(mar = rep(3, 4), mfrow = c(1, 4))

# For each column listed in num_idx, plot the two-column data frame
# (that column, SalePrice) with the y axis fixed to the SalePrice range
for (ni in num_idx) {
  plot2(
    train[, c(nm[ni], 'SalePrice')],
    main = nm[ni],
    ylim = range(train$SalePrice)
  )
}

# Uniform 3-line margins, four panels per row
par(mar = rep(3, 4), mfrow = c(1, 4))

# For each column listed in cat_idx, plot SalePrice against that column
# converted to a factor; the y axis is fixed to the SalePrice range
for (ni in cat_idx) {
  plot2(
    factor(train[[ni]]),
    train[['SalePrice']],
    main = nm[ni],
    ylim = range(train[['SalePrice']]),
    frame = FALSE
  )
}

Top correlated numeric variables (pairwise-complete observations)

# Pairwise correlations between the columns selected by num_idx
cor_mat <- cor(train[, num_idx], use = 'pairwise.complete')
# Blank the diagonal and the lower triangle so every pair appears only once
cor_mat[lower.tri(cor_mat, diag = TRUE)] <- NA

# Fraction of rows in which both columns of a pair are non-missing
not_na_mat <- !is.na(as.matrix(train[, num_idx]))
pairwise_completeness <- crossprod(not_na_mat) / nrow(train)

# Flatten both matrices (column-major) into one row per ordered pair:
# x1 is the matrix column's variable, x2 the matrix row's variable
cor_df <- data.frame(
  x1 = colnames(cor_mat)[as.vector(col(cor_mat))],
  x2 = colnames(cor_mat)[as.vector(row(cor_mat))],
  cor = as.vector(cor_mat),
  perc_complete = as.vector(pairwise_completeness)
)

# Drop the blanked entries, then sort by absolute correlation, strongest first
cor_df <- cor_df[!is.na(cor_df$cor), ]
cor_df <- cor_df[order(abs(cor_df$cor), decreasing = TRUE), ]
# Show only the pairs with |cor| above 0.50
knitr::kable(cor_df[abs(cor_df$cor) > 0.50, ], row.names = FALSE)
| x1 | x2 | cor | perc_complete |
|:---|:---|---:|---:|
| GarageArea | GarageCars | 0.8824754 | 1.0000000 |
| GarageYrBlt | YearBuilt | 0.8256675 | 0.9445205 |
| TotRmsAbvGrd | GrLivArea | 0.8254894 | 1.0000000 |
| X1stFlrSF | TotalBsmtSF | 0.8195300 | 1.0000000 |
| GrLivArea | X2ndFlrSF | 0.6875011 | 1.0000000 |
| TotRmsAbvGrd | BedroomAbvGr | 0.6766199 | 1.0000000 |
| BsmtFullBath | BsmtFinSF1 | 0.6492118 | 1.0000000 |
| GarageYrBlt | YearRemodAdd | 0.6422768 | 0.9445205 |
| FullBath | GrLivArea | 0.6300116 | 1.0000000 |
| TotRmsAbvGrd | X2ndFlrSF | 0.6164226 | 1.0000000 |
| HalfBath | X2ndFlrSF | 0.6097073 | 1.0000000 |
| GarageCars | OverallQual | 0.6006707 | 1.0000000 |
| GrLivArea | OverallQual | 0.5930074 | 1.0000000 |
| YearRemodAdd | YearBuilt | 0.5928550 | 1.0000000 |
| GarageCars | GarageYrBlt | 0.5889200 | 0.9445205 |
| YearBuilt | OverallQual | 0.5723228 | 1.0000000 |
| GrLivArea | X1stFlrSF | 0.5660240 | 1.0000000 |
| GarageArea | GarageYrBlt | 0.5645671 | 0.9445205 |
| GarageArea | OverallQual | 0.5620218 | 1.0000000 |
| TotRmsAbvGrd | FullBath | 0.5547843 | 1.0000000 |
| YearRemodAdd | OverallQual | 0.5506839 | 1.0000000 |
| FullBath | OverallQual | 0.5505997 | 1.0000000 |
| GarageYrBlt | OverallQual | 0.5477658 | 0.9445205 |
| GarageCars | YearBuilt | 0.5378501 | 1.0000000 |
| TotalBsmtSF | OverallQual | 0.5378085 | 1.0000000 |
| TotalBsmtSF | BsmtFinSF1 | 0.5223961 | 1.0000000 |
| BedroomAbvGr | GrLivArea | 0.5212695 | 1.0000000 |
| BedroomAbvGr | X2ndFlrSF | 0.5029006 | 1.0000000 |